## Sourobh Ghosh, Casey Kearney, Mahdi Hashemian
## Gov 2001 Replication Project


require(readstata13)

setwd("C:/Documents CLK/Grad School/Spring 2016/Gov 2001/Replication Paper/Data/Data")  ## modify this as necessary

data = read.dta13("SovietCollaboration_MainDataset.dta")
data.Table5 = read.dta13("SovietCollaboration_CitationToSovietArtDataset.dta")  ## for Table 5
data.spec = read.dta13("SovietCollaboration_SpecializationDataset.dta") ## for Fig 7, Table 6, Table A8-A9

## dropping Soviet authors
newd = data[-which(data$sovietauthor == 1), ]

## Want to create a new variable of the percent of total publications for field

year.vector <- unique(newd$year)

year.vector <- sort(year.vector)
year.vector
year.vector <- cbind(year.vector,table(newd$year))
head(year.vector)
nrow(year.vector)
dim(year.vector)

subject.vector <- unique(newd$adjustedmsc)
subject.vector <- sort(subject.vector)
subject.vector

## want subject by Year to put into a lookup vector
placeholder <- rep(NA,length(subject.vector))

for (j in 1:nrow(year.vector)) {
  a <- year.vector[j]
  subject.placeholder.by.year <- rep(NA, length(subject.vector))
  for (i in 1:length(subject.vector)) {
    b <- subject.vector[i]
    c <- length(which(newd$year == a & newd$adjustedmsc==b))
    subject.placeholder.by.year[i] <- c
  }
  placeholder <- rbind(placeholder,subject.placeholder.by.year)
}

head(placeholder)
dim(placeholder)
length(which(newd$year == 1970 & newd$adjustedmsc == 5)) ## spot checking a value from the loop
subject.count.vector <- placeholder[-1,]
dim(subject.count.vector)
lookup.vector <- cbind(year.vector,subject.count.vector)

head(lookup.vector)

growth.popularity.placeholder <- rep(NA, nrow(newd))
for (i in 1:nrow(newd)) {
  a <- newd$year[i]
  if (a > 1970) {
    b <- newd$adjustedmsc[i]
    c <- lookup.vector[(a-1969),(which(subject.vector==b)+2)]/lookup.vector[(a-1970),(which(subject.vector==b)+2)]
    } else {
    c <- NA
  }
  growth.popularity.placeholder[i] <- c-1
}
length(growth.popularity.placeholder)
head(growth.popularity.placeholder)
head(newd)


newd <- cbind(newd,growth.popularity.placeholder)
head(newd)
##### robust cluster SEs function (http://www.r-bloggers.com/easy-clustered-standard-errors-in-r/)

get_CL_vcov<-function(model, cluster){
  require(sandwich)
  require(lmtest)
  
  #calculate degree of freedom adjustment
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- model$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  
  #calculate the uj's
  uj  <- apply(estfun(model),2, function(x) tapply(x, cluster, sum))
  
  #use sandwich to get the var-covar matrix
  vcovCL <- dfc*sandwich(model, meat=crossprod(uj)/N)
  return(vcovCL)
}


newt3b3 = subset(newd, newbdrankingposition<=3 | newbdrankingposition>=31, 
                 select=c(logauthourcount, newbdrankingposition, afterdummy, adjustedmsc, year,
                           growth.popularity.placeholder))

head(newt3b3)

## creating dummy for topsoviet
topsovietdummy = rep(0,nrow(newt3b3))
for (i in 1:nrow(newt3b3)){  if (newt3b3$newbdrankingposition[i] < 4){topsovietdummy[i] = 1} }

t3b3 = cbind(newt3b3,topsovietdummy)
t3b3 <- t3b3[-(which(t3b3$year==1970)),]
head(t3b3)
nrow(t3b3)
fit = lm(logauthourcount ~ topsovietdummy:afterdummy 
         + factor(adjustedmsc) + factor(year), data = t3b3)  ## OLS with fixed effects 

## call robust cluster SE function and save the var-cov matrix output in an object
m1.vcovCL<-get_CL_vcov(fit, t3b3$adjustedmsc)

## results
coeftest(fit, m1.vcovCL)  ## coefficient on AfterIronCurtain x SovietRich and corresponding robust cluster SE
summary(fit)$adj.r.squared  ## R-squared

fit2 = lm(logauthourcount ~ topsovietdummy:afterdummy 
          + factor(adjustedmsc) + factor(year) + growth.popularity.placeholder, data = t3b3) 

m2.vcovCL<-get_CL_vcov(fit2, t3b3$adjustedmsc)

## results
coeftest(fit2, m2.vcovCL)  ## coefficient on AfterIronCurtain x SovietRich and corresponding robust cluster SE
summary(fit2)$adj.r.squared  ## R-squared


## 2 Year Compound Growth Rate Popularity Predictor

lag.popularity.placeholder <- rep(NA, nrow(newd))
for (i in 1:nrow(newd)) {
  a <- newd$year[i]
  if (a > 1971) {
    b <- newd$adjustedmsc[i]
    c <- (lookup.vector[(a-1969),(which(subject.vector==b)+2)]/(lookup.vector[(a-1971),(which(subject.vector==b)+2)]))^0.5 - 1
  } else {
    c <- NA
  }
  lag.popularity.placeholder[i] <- c
}

head(lag.popularity.placeholder)
head(newd)
length(which(newd$year == 1970 | newd$year == 1971))

length(lag.popularity.placeholder)
test <- length(which(is.na(lag.popularity.placeholder)==T))
test ## 14787, matching the sum observations for 1970 and 1971, as expected
  newd.lag <- cbind(newd,lag.popularity.placeholder)
  head(newd.lag)
  
  newd.lag <- newd.lag[-which(newd.lag$year==1970|newd.lag$year==1971),] ## drop 1970 and 1971 observations
  
  newt3b3.lag = subset(newd.lag, newbdrankingposition<=3 | newbdrankingposition>=31, 
                   select=c(logauthourcount, newbdrankingposition, afterdummy, adjustedmsc, year,
                            lag.popularity.placeholder))
  
  ## creating dummy for topsoviet
  topsovietdummy = rep(0,nrow(newt3b3.lag))
  for (i in 1:nrow(newt3b3.lag)){  if (newt3b3.lag$newbdrankingposition[i] < 4){topsovietdummy[i] = 1} }
  
  t3b3.lag = cbind(newt3b3.lag,topsovietdummy)
  head(t3b3.lag)
  nrow(t3b3.lag)
  fit.lag = lm(logauthourcount ~ topsovietdummy:afterdummy 
           + factor(adjustedmsc) + factor(year), data = t3b3.lag)  ## OLS with fixed effects
  
  head(t3b3.lag)
  
  ## call robust cluster SE function and save the var-cov matrix output in an object
  m1.vcovCL.lag<-get_CL_vcov(fit.lag, t3b3.lag$adjustedmsc)
  
  ## results
  coeftest(fit.lag, m1.vcovCL.lag)  ## coefficient on AfterIronCurtain x SovietRich and corresponding robust cluster SE
  summary(fit.lag)$adj.r.squared  ## R-squared
  
  fit2.lag = lm(logauthourcount ~ topsovietdummy:afterdummy 
            + factor(adjustedmsc) + factor(year) + lag.popularity.placeholder, data = t3b3.lag) 
  
  m2.vcovCL.lag<-get_CL_vcov(fit2.lag, t3b3.lag$adjustedmsc)
  
  ## results
  coeftest(fit2.lag, m2.vcovCL.lag)  ## coefficient on AfterIronCurtain x SovietRich and corresponding robust cluster SE
  summary(fit2.lag)$adj.r.squared  ## R-squared
  
  top.only.data <- subset(t3b3.lag, newbdrankingposition <= 3)
  head(top.only.data)
  nrow(top.only.data)
  top.only.data$lag.popularity.placeholder
  plot(top.only.data$year, top.only.data$lag.popularity.placeholder)
  plot(t3b3.lag$year,t3b3.lag$lag.popularity.placeholder)
  hist(lag.popularity.placeholder)
  hist(lag.popularity.placeholder+1)
  hist(log(top.only.data$lag.popularity.placeholder+2))
  